infn = "../data/Int Hypox Invitro and Human Meta Data.csv"
metadata = read.table(infn, sep=",", header=TRUE, stringsAsFactors=FALSE)
Here we make the coding for the variables easier to work with by making them smaller and removing spaces and other problem-causing characters. We will also make an ID that has the important metadata in it, to make plots easier to interpet.
date2month = function(str) {
month = months(as.Date(str, format="%m.%d.%y"))
return(substring(month, 1, 3))
}
metadata$month = as.factor(unlist(lapply(metadata$Experiment.Date, date2month)))
metadata$line = as.factor(gsub(" ", "", metadata$Cell.Line))
metadata$spiked = as.factor(ifelse(grepl("No", metadata$Spike.In ), "nospike",
"spike"))
metadata$hypoxia = as.factor(ifelse(grepl("Intermittent", metadata$Condition), "IH",
ifelse(grepl("Sustained", metadata$Condition), "SH",
"NO")))
metadata$hypoxia = relevel(metadata$hypoxia, ref="NO")
metadata$filename = metadata$RCC.FILE.NAME
metadata$fullname = metadata$Full.Sample.Name
metadata$column = ifelse(grepl("Oligo", metadata$Column), "oligo",
ifelse(grepl("Reg", metadata$Column), "reg", "nocol"))
metadata$replicate = unlist(lapply(metadata$Replicate.Number,
function(x) strsplit(as.character(x), "-")[[1]][2]))
metadata$id = as.character(paste(metadata$line, metadata$hypoxia, metadata$spiked,
metadata$column, metadata$month, sep="_"))
metadata$id = ifelse(grepl("Blank", metadata$id), metadata$Full.Sample.Name,
metadata$id)
metadata$linehypo = paste(metadata$line, metadata$hypoxia, sep="_")
Now we are ready to roll and can read in the RCC files using the sweet NanoStringQCPro library. We’ll just read it in, extract the counts and then actually do the analysis with DESeq2.
library(NanoStringQCPro)
##
rccFiles = paste("../data/rcc/", metadata[,"filename"], sep="")
#blanks = c("20160602_ES052616miRNAV3aC2_Blank_11.RCC", "20160602_ES052616miRNAV3aC2_Blank_12.RCC")
eset = newRccSet(rccFiles=rccFiles, blankLabel="Blank")
## Reading RCC files...
## checkRccSet() messages:
## Optional featureData cols not found: BarCode, ProbeID, TargetSeq, Species, Comments
## The following panel housekeeping genes were found: B2M|0, GAPDH|0, RPL19|0, ACTB|0, RPLP0|0
## Warning in .local(rccSet, ...):
## Non-standard CodeClass values found in featureData (values currently recognized are "Endogenous", "Housekeeping", "Positive", and "Negative"): Ligation, Endogenous1, SpikeIn)
pdata = pData(eset)
pdata$SampleType = metadata$line
pData(eset) <- pdata
proc = preprocRccSet(eset)
counts = as.data.frame(exprs(eset))
colnames(counts) = metadata$id
is_positive = function(column) {
return(grepl("Pos", column))
}
is_negative = function(column) {
return(grepl("Neg", column))
}
is_spikein = function(column) {
return(grepl("Spike", column))
}
is_ligation = function(column) {
return(grepl("Ligati", column))
}
is_housekeeping = function(column) {
return(grepl("Housekee", column))
}
is_prior = function(column) {
return(grepl("miR-210", column) | grepl("miR-365", column))
}
A small set of miRNA have known inflated values; Nanostring has a heuristic to correct for the inflation via the formula using some supplied constants. Here we implement that fix to correct the expression of those miRNA.
correct_miRNA = function(eset) {
require(dplyr)
fdat = fData(eset)
fdat = fdat %>%
tidyr::separate(GeneName, c("Name", "scalefactor"), sep='\\|',
fill="right", remove=FALSE) %>%
dplyr::mutate(scalefactor=ifelse(is.na(scalefactor), 0, scalefactor))
sf = as.numeric(fdat$scalefactor)
posa = as.numeric(counts["Positive_POS_A_ERCC_00117.1",])
scales = t(replicate(length(sf), posa))
scales = scales * sf
scaled = counts - scales
scaled[scaled<0] = 0
rownames(scaled) = fdat$Name
return(round(scaled))
}
plot_corrected_miRNA = function(scaled, counts) {
require(reshape)
require(ggplot2)
rownames(counts) = rownames(scaled)
is_scaled = rowSums(abs(counts - scaled)) > 0
counts = melt(as.matrix(counts[is_scaled,]))
colnames(counts) = c("gene", "sample", "value")
counts$scaled = "no"
scaled = melt(as.matrix(scaled[is_scaled,]))
colnames(scaled) = c("gene", "sample", "value")
scaled$scaled = "yes"
all = rbind(counts, scaled)
library(cowplot)
ggplot(all, aes(sample, value, color=scaled)) + facet_wrap(~gene) +
geom_point(size=0.5) +
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
guides(colour = guide_legend(override.aes = list(size=6))) + ylab("") +
ggtitle("miRNA hybridization crosstalk correction")
}
scaled = correct_miRNA(eset)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:Biobase':
##
## combine
## The following objects are masked from 'package:BiocGenerics':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
plot_corrected_miRNA(scaled, counts)
## Loading required package: reshape
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
## Loading required package: ggplot2
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
This percentage should be super high, close to 100%. If there isn’t that usually means the cartridge needs to be rescanned. If it is < 3 weeks from the scan, rescanning should be okay. Nanostring folks call < 75% a failure.
library(cowplot)
pdat = pData(eset) %>% left_join(metadata, by=c("FileName"="RCC.FILE.NAME"))
pdat$pcounted = pdat$FovCounted/pdat$FovCount * 100
ggplot(pdat, aes(id, pcounted)) + geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
scale_y_continuous(expand = c(0,0)) +
expand_limits(y = c(0,1.05 * max(pdat$pcounted))) +
ylab("percentage of FOV counted") + xlab("sample") +
geom_hline(yintercept=75, color="red")
Nanostring folks call a sample problematic if the binding density is < 0.05 or > 2.25.
ggplot(pdat, aes(id, BindingDensity)) + geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
scale_y_continuous(expand = c(0,0)) +
expand_limits(y = c(0,1.05 * max(pdat$BindingDensity))) +
ylab("Binding density") + xlab("sample") +
geom_hline(yintercept=0.05, color="red") +
geom_hline(yintercept=2.25, color="red")
Here I plotted the total counts vs the number of miRNA detected, where detected is it has counts > 30. There is a pretty big spread among the samples in terms of the miRNA detected. The Huh7 cells look like they have a lower number of genes detected on average for a given number of reads. This might be a technical artifact or it could also be that there are less miRNA in the Huh7 cells.
endocounts = counts[grepl("Endo", rownames(counts)),]
cdf = data.frame(total=colSums(counts), detected=colSums(counts > 30))
rownames(cdf) = colnames(counts)
cdf$id = rownames(cdf)
cdf = cdf %>% left_join(metadata, by="id")
ggplot(cdf, aes(total, detected, color=column, label=linehypo)) +
geom_text() + scale_x_log10()
Below we look at the R^2 correlation between the expected positive control concentrations and the observed concentrations for each sample.
pcdf = data.frame(concentration=log2(c(128, 32, 8, 2, 0.5, 0.125)),
GeneName=paste("POS", c("A", "B", "C", "D", "E", "F"), sep="_"))
pccounts = subset(exprs(eset), grepl("Positive_POS", rownames(exprs(eset))))
pccounts = pccounts[sort(rownames(pccounts)),]
rownames(pccounts) = pcdf$GeneName
corsamples = data.frame(correlation=apply(pccounts, 2,
function(x) cor(x, pcdf$concentration)),
sample=colnames(pccounts)) %>%
left_join(metadata, by=c("sample"="RCC.FILE.NAME"))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
ggplot(corsamples, aes(sample, correlation)) + geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
scale_y_continuous(expand = c(0,0)) +
expand_limits(y = c(0,1.05 * max(corsamples$correlation))) +
ylab("positive control correlation") +
xlab("positive control correlation")
Here we look at R^2 for the ligation controls as well. It looks like a couple samples failed.
pcdf = data.frame(concentration=log2(c(128, 32, 8)),
GeneName=paste("POS_", c("A", "B", "C"), sep="_"))
pccounts = subset(exprs(eset), grepl("Ligation_LIG_POS", rownames(exprs(eset))))
pccounts = pccounts[sort(rownames(pccounts)),]
rownames(pccounts) = pcdf$GeneName
corsamples = data.frame(correlation=apply(pccounts, 2,
function(x) cor(x, pcdf$concentration)),
sample=colnames(pccounts)) %>%
left_join(metadata, by=c("sample"="RCC.FILE.NAME"))
## Warning in left_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## character vector and factor, coercing into character vector
ggplot(corsamples, aes(sample, correlation)) + geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
scale_y_continuous(expand = c(0,0)) +
expand_limits(y = c(0,1.05 * max(corsamples$correlation))) +
ylab("ligation control correlation") +
xlab("ligation control correlation")
subset(corsamples, correlation < 0.75)$sample
## [1] "20160602_ES052616miRNAV3aC2_Blank_11.RCC"
## [2] "20160602_ES052616miRNAV3aC2_Blank_12.RCC"
Ah, of course, the blanks.
library(ggplot2)
library(dplyr)
library(cowplot)
extract_pred = function(counts, predicate) {
toplot = counts[predicate(rownames(counts)),] %>%
tibble::rownames_to_column() %>%
tidyr::gather("sample", "count", -rowname)
colnames(toplot) = c("spot", "sample", "count")
toplot = toplot %>% left_join(metadata, by=c("sample"="id"))
return(toplot)
}
spotbarplot = function(toplot) {
ggplot(toplot,
aes(sample, count)) + geom_bar(stat='identity') +
facet_wrap(~spot) +
theme(axis.text.x = element_blank(),
text = element_text(size=8))
}
spotboxplot = function(toplot) {
ggplot(toplot,
aes(linehypo, count)) + geom_boxplot() +
facet_wrap(~spot) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
spotbarplot(extract_pred(counts, is_positive))
Negative counts can be relatively high in some of the samples compared to the other samples.
spotbarplot(extract_pred(counts, is_negative))
knitr::kable(subset(extract_pred(counts, is_negative), count > 50))
| spot | sample | count | RCC.FILE.NAME | Full.Sample.Name | Cell.Line | Condition | What | Column | Spike.In | Experiment.Date | Replicate.Number | Date.Shipped | RCC.File.Date | month | line | spiked | hypoxia | filename | fullname | column | replicate | linehypo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | Negative_NEG_C_ERCC_00019.1 | Huh7_IH_nospike_nocol_Jan | 64 | 20150421_PHS 1_01_10.RCC | IH1 | Huh 7 | Intermittent Hypoxia | Huh 7 cells Int Hypox | None | No spike in | 1.8.15 | H-1 | 4.6.15 | 4.21.2015 | Jan | Huh7 | nospike | IH | 20150421_PHS 1_01_10.RCC | IH1 | nocol | 1 | Huh7_IH |
| 3 | Negative_NEG_E_ERCC_00098.1 | Huh7_IH_nospike_nocol_Jan | 79 | 20150421_PHS 1_01_10.RCC | IH1 | Huh 7 | Intermittent Hypoxia | Huh 7 cells Int Hypox | None | No spike in | 1.8.15 | H-1 | 4.6.15 | 4.21.2015 | Jan | Huh7 | nospike | IH | 20150421_PHS 1_01_10.RCC | IH1 | nocol | 1 | Huh7_IH |
| 4 | Negative_NEG_A_ERCC_00096.1 | Huh7_IH_nospike_nocol_Jan | 52 | 20150421_PHS 1_01_10.RCC | IH1 | Huh 7 | Intermittent Hypoxia | Huh 7 cells Int Hypox | None | No spike in | 1.8.15 | H-1 | 4.6.15 | 4.21.2015 | Jan | Huh7 | nospike | IH | 20150421_PHS 1_01_10.RCC | IH1 | nocol | 1 | Huh7_IH |
| 9 | Negative_NEG_C_ERCC_00019.1 | Huh7_NO_nospike_nocol_Jan | 95 | 20150421_PHS 1_01_11.RCC | N1 | Huh 7 | Normoxia | Huh7 Normox | None | No spike in | 1.8.15 | H-1 | 4.6.15 | 4.21.2015 | Jan | Huh7 | nospike | NO | 20150421_PHS 1_01_11.RCC | N1 | nocol | 1 | Huh7_NO |
| 11 | Negative_NEG_E_ERCC_00098.1 | Huh7_NO_nospike_nocol_Jan | 87 | 20150421_PHS 1_01_11.RCC | N1 | Huh 7 | Normoxia | Huh7 Normox | None | No spike in | 1.8.15 | H-1 | 4.6.15 | 4.21.2015 | Jan | Huh7 | nospike | NO | 20150421_PHS 1_01_11.RCC | N1 | nocol | 1 | Huh7_NO |
| 17 | Negative_NEG_C_ERCC_00019.1 | Huh7_SH_nospike_nocol_Jan | 86 | 20150421_PHS 1_01_12.RCC | SH1 | Huh 7 | Sustained Hypoxia | Huh7 Sust Hypox | None | No spike in | 1.8.15 | H-1 | 4.6.15 | 4.21.2015 | Jan | Huh7 | nospike | SH | 20150421_PHS 1_01_12.RCC | SH1 | nocol | 1 | Huh7_SH |
| 19 | Negative_NEG_E_ERCC_00098.1 | Huh7_SH_nospike_nocol_Jan | 82 | 20150421_PHS 1_01_12.RCC | SH1 | Huh 7 | Sustained Hypoxia | Huh7 Sust Hypox | None | No spike in | 1.8.15 | H-1 | 4.6.15 | 4.21.2015 | Jan | Huh7 | nospike | SH | 20150421_PHS 1_01_12.RCC | SH1 | nocol | 1 | Huh7_SH |
Was anything different about those samples?
It looks like there are only some samples that have spike-ins.
spotbarplot(extract_pred(counts, is_spikein))
And those correspond to the spike-ins in the metadata:
knitr::kable(subset(extract_pred(counts, is_spikein), count > 500))
| spot | sample | count | RCC.FILE.NAME | Full.Sample.Name | Cell.Line | Condition | What | Column | Spike.In | Experiment.Date | Replicate.Number | Date.Shipped | RCC.File.Date | month | line | spiked | hypoxia | filename | fullname | column | replicate | linehypo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 18 | SpikeIn_cel-miR-254|0_MIMAT0000310 | U937_NO_spike_nocol_Oct | 4287 | 20151118_Espy-11-18-15-rescan_01_07.RCC | UN | U937 | Normoxia | U937 Normox | None | + Spike In | 10.27.15 | U-1 | 10.28.15 | 11.15.15 | Oct | U937 | spike | NO | 20151118_Espy-11-18-15-rescan_01_07.RCC | UN | nocol | 1 | U937_NO |
| 23 | SpikeIn_cel-miR-254|0_MIMAT0000310 | U937_IH_spike_nocol_Oct | 5390 | 20151118_Espy-11-18-15-rescan_01_08.RCC | UIH | U937 | Intermittent Hypoxia | U937 Int Hypox | None | + Spike In | 10.27.15 | U-1 | 10.28.15 | 11.15.15 | Oct | U937 | spike | IH | 20151118_Espy-11-18-15-rescan_01_08.RCC | UIH | nocol | 1 | U937_IH |
| 28 | SpikeIn_cel-miR-254|0_MIMAT0000310 | U937_SH_spike_nocol_Oct | 5733 | 20151118_Espy-11-18-15-rescan_01_09.RCC | USH | U937 | Sustained Hypoxia | U937 Sust Hypx | None | + Spike In | 10.27.15 | U-1 | 10.28.15 | 11.15.15 | Oct | U937 | spike | SH | 20151118_Espy-11-18-15-rescan_01_09.RCC | USH | nocol | 1 | U937_SH |
| 33 | SpikeIn_cel-miR-254|0_MIMAT0000310 | Huh7_NO_spike_nocol_Oct | 6840 | 20151118_Espy-11-18-15-rescan_01_10.RCC | H7N | Huh 7 | Normoxia | Huh 7 Normox | None | + Spike In | 10.27.15 | H-2 | 10.28.15 | 11.15.15 | Oct | Huh7 | spike | NO | 20151118_Espy-11-18-15-rescan_01_10.RCC | H7N | nocol | 2 | Huh7_NO |
| 38 | SpikeIn_cel-miR-254|0_MIMAT0000310 | Huh7_IH_spike_nocol_Oct | 5883 | 20151118_Espy-11-18-15-rescan_01_11.RCC | H7IH | Huh 7 | Intermittent Hypoxia | Huh 7 Int Hypox | None | + Spike In | 10.27.15 | H-2 | 10.28.15 | 11.15.15 | Oct | Huh7 | spike | IH | 20151118_Espy-11-18-15-rescan_01_11.RCC | H7IH | nocol | 2 | Huh7_IH |
| 43 | SpikeIn_cel-miR-254|0_MIMAT0000310 | Huh7_SH_spike_nocol_Oct | 4708 | 20151118_Espy-11-18-15-rescan_01_12.RCC | H7SH | Huh 7 | Sustained Hypoxia | Huh 7 Sust hypox | None | + Spike In | 10.27.15 | H-2 | 10.28.15 | 11.15.15 | Oct | Huh7 | spike | SH | 20151118_Espy-11-18-15-rescan_01_12.RCC | H7SH | nocol | 2 | Huh7_SH |
| 48 | SpikeIn_cel-miR-254|0_MIMAT0000310 | U937_NO_spike_nocol_Jan | 1501 | 20160114_ES011416miRNAV3a_U937 N_01.RCC | UN | U937 | Normoxia | U937 Normox | None | + Spike In | 1.3.16 | U-2 | 1.4.16 | 1.20.16 | Jan | U937 | spike | NO | 20160114_ES011416miRNAV3a_U937 N_01.RCC | UN | nocol | 2 | U937_NO |
| 53 | SpikeIn_cel-miR-254|0_MIMAT0000310 | U937_IH_spike_nocol_Jan | 2333 | 20160114_ES011416miRNAV3a_U937 IH_02.RCC | UIH | U937 | Intermittent Hypoxia | U937 Int Hypox | None | + Spike In | 1.3.16 | U-2 | 1.4.16 | 1.20.16 | Jan | U937 | spike | IH | 20160114_ES011416miRNAV3a_U937 IH_02.RCC | UIH | nocol | 2 | U937_IH |
| 63 | SpikeIn_cel-miR-254|0_MIMAT0000310 | Huh7_NO_spike_oligo_Jan | 3587 | 20160114_ES011416miRNAV3a_H7 Nor5m_04.RCC | HN9 | Huh 7 | Normoxia | Huh 7 Normox | Oligo | + Spike In | 1.3.16 | H-3 | 1.4.16 | 1.20.16 | Jan | Huh7 | spike | NO | 20160114_ES011416miRNAV3a_H7 Nor5m_04.RCC | HN9 | oligo | 3 | Huh7_NO |
| 73 | SpikeIn_cel-miR-254|0_MIMAT0000310 | Huh7_SH_spike_reg_Jan | 1308 | 20160114_ES011416miRNAV3a_H7 SH_06.RCC | HSHR | Huh 7 | Sustained Hypoxia | Huh 7 Sust hypox | Regular | +Spike IN | 1.3.16 | H-3 | 1.4.16 | 1.20.16 | Jan | Huh7 | spike | SH | 20160114_ES011416miRNAV3a_H7 SH_06.RCC | HSHR | reg | 3 | Huh7_SH |
It looks like some of the ligation controls worked better than the other. Kit suggested just normalizing by the mean of the posA ligation control. What we will do instead is do the normalization we usually do, but include the ligation efficiency for each sample as a factor in the regression. Then we can test whether or not our normalization scheme is correcting for the ligation efficiency. It should be, if the result of the ligation efficiency is overall less miRNA.
spotbarplot(extract_pred(counts, is_ligation))
Here we calculated ligation efficiency for the samples. This is the log of LIG_POS_A(i) / mean(LIG_POS_A) for each sample i. It doesn’t matter which direction this is in, it will get corrected for in the model.
lig_posa = as.numeric(counts[grepl("Ligation_LIG_POS_A", rownames(counts)),])
metadata$lignorm = log2(lig_posa / mean(lig_posa))
ggplot(subset(metadata, line != "Blank"), aes(id, lignorm)) + geom_point() +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.x=element_text(size=8)) +
ylab("ligation factor") +
xlab("sample")
For a subset of the samples, one housekeeping gene is very high.
spotbarplot(extract_pred(counts, is_housekeeping))
These don’t look confined to any particular cell type. Why are some of these so different?
table(subset(extract_pred(counts, is_housekeeping), count > 1000)$sample)
##
## Huh7_IH_spike_nocol_Oct Huh7_NO_nospike_nocol_Dec
## 1 1
## Huh7_NO_spike_nocol_Oct Huh7_SH_nospike_nocol_Jan
## 3 1
## Huh7_SH_spike_nocol_Oct THP_IH_nospike_nocol_Aug
## 1 1
## THP_IH_nospike_nocol_Dec THP_IH_nospike_nocol_Jul
## 1 1
## THP_IH_nospike_nocol_Sep THP_NO_nospike_nocol_Aug
## 1 1
## THP_NO_nospike_nocol_Dec THP_NO_nospike_nocol_Jul
## 1 1
## THP_NO_nospike_nocol_Sep THP_SH_nospike_nocol_Aug
## 2 1
## THP_SH_nospike_nocol_Dec THP_SH_nospike_nocol_Jul
## 1 1
## THP_SH_nospike_nocol_Sep U937_IH_nospike_nocol_Aug
## 2 1
## U937_IH_nospike_nocol_Jul U937_IH_nospike_nocol_Nov
## 2 1
## U937_IH_spike_nocol_Jan U937_IH_spike_nocol_Oct
## 1 1
## U937_NO_nospike_nocol_Aug U937_NO_nospike_nocol_Jul
## 1 2
## U937_NO_nospike_nocol_Nov U937_NO_spike_nocol_Jan
## 1 1
## U937_NO_spike_nocol_Oct U937_SH_nospike_nocol_Aug
## 1 1
## U937_SH_nospike_nocol_Jul U937_SH_nospike_nocol_Nov
## 2 1
## U937_SH_spike_nocol_Jan U937_SH_spike_nocol_Oct
## 1 2
drop = is_spikein(rownames(counts))
drop = drop | is_positive(rownames(counts))
drop = drop | is_housekeeping(rownames(counts))
drop = drop | is_ligation(rownames(counts))
keep = counts[!drop,]
keep = keep[, !grepl("Blank", colnames(keep))]
metadata = subset(metadata, id %in% colnames(keep))
counts = keep
We’ll normalize the libraries and look at the negative control expression and then come up with a cutoff that is above the noise floor for the libraries.
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:reshape':
##
## expand, rename
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following objects are masked from 'package:base':
##
## colMeans, colSums, expand.grid, rowMeans, rowSums
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, regroup, slice
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
dds = DESeqDataSetFromMatrix(counts, colData=metadata, design=~month+column+line+hypoxia+lignorm)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## factor levels were dropped which had no samples
dds = estimateSizeFactors(dds)
ncounts = data.frame(counts(dds, normalized=TRUE))
Below we can see after normalizing by library size, if we set the cutoff to be 30, we are mostly above the noise floor measured by the negative controls:
ggplot(extract_pred(ncounts, is_negative), aes(count)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We used the threshold of 30 as the noise floor and dropped all" miRNA that did not have at least 30 counts in 20% of the samples.
keepgenes = rowSums(ncounts > 30) > 0.2*ncol(ncounts)
counts = counts[keepgenes,]
Here we fit two models. The first model we fit the full model, including the ligation normalization term in the model. ~month+column+line+hypoxia+lignorm. Then we fit a reduced model without the lignorm term to answer the question ‘is there miRNA that are affected by the ligation efficiency?’
Yes, it looks like even after normalization, if we fit a model with just the ligation efficiency that there are miRNA affected. So we should correct for that. Here you can see only a subset of the miRNA are affected; if we do it like this instead of just scaling the counts, we can correct for the specific miRNA that have counts that are correlated with the ligation efficiency.
dds = DESeqDataSetFromMatrix(counts, colData=metadata, design=~month+column+line+hypoxia+lignorm)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## factor levels were dropped which had no samples
dds = DESeq(dds, full=~month+column+line+hypoxia+lignorm,
reduced=~month+column+line+hypoxia, test="LRT")
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
res = results(dds)
plotMA(res)
res = data.frame(res) %>% tibble::rownames_to_column() %>%
arrange(padj) %>% filter(padj < 0.05)
knitr::kable(subset(res, padj < 0.05))
| rowname | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj |
|---|---|---|---|---|---|---|
| Endogenous1_hsa-miR-146a-5p|0_MIMAT0000449 | 657.35125 | 0.7237097 | 0.1508606 | 23.395360 | 0.0000013 | 0.0001715 |
| Endogenous1_hsa-miR-302d-3p|0.006_MIMAT0000718 | 102.91760 | -0.5390867 | 0.1240328 | 19.495139 | 0.0000101 | 0.0006556 |
| Endogenous1_hsa-miR-548ar-5p|0.004_MIMAT0022265 | 39.47914 | -0.5077435 | 0.1242403 | 16.389052 | 0.0000516 | 0.0022352 |
| Endogenous1_hsa-miR-374b-5p|0_MIMAT0004955 | 72.55252 | 0.5847129 | 0.1523364 | 15.176599 | 0.0000979 | 0.0031821 |
| Endogenous1_hsa-miR-26a-5p|0_MIMAT0000082 | 132.79106 | 0.4393709 | 0.1173975 | 14.519217 | 0.0001387 | 0.0036072 |
| Endogenous1_hsa-miR-362-3p|0_MIMAT0004683 | 28.27474 | 0.4630149 | 0.1281661 | 13.423712 | 0.0002485 | 0.0053834 |
| Endogenous1_hsa-let-7c-5p|0_MIMAT0000064 | 33.92337 | -0.4690759 | 0.1360234 | 11.693007 | 0.0006274 | 0.0101945 |
| Endogenous1_hsa-miR-378e|0.021_MIMAT0018927 | 129.44567 | -0.4333283 | 0.1269200 | 11.724728 | 0.0006168 | 0.0101945 |
| Endogenous1_hsa-miR-19b-3p|0_MIMAT0000074 | 224.28645 | 0.5561009 | 0.1606614 | 10.935508 | 0.0009434 | 0.0131173 |
| Endogenous1_hsa-miR-28-3p|0_MIMAT0004502 | 31.40274 | 0.3815731 | 0.1184338 | 10.473747 | 0.0012108 | 0.0131173 |
| Endogenous1_hsa-miR-106b-5p|0_MIMAT0000680 | 104.92074 | 0.4505765 | 0.1362040 | 10.563968 | 0.0011531 | 0.0131173 |
| Endogenous1_hsa-miR-494-3p|0.099_MIMAT0002816 | 886.09046 | 0.3391129 | 0.1028740 | 10.540706 | 0.0011677 | 0.0131173 |
| Endogenous1_hsa-miR-148b-3p|0_MIMAT0000759 | 53.42077 | -0.3374903 | 0.1054467 | 10.135736 | 0.0014542 | 0.0145422 |
| Endogenous1_hsa-miR-612|0_MIMAT0003280 | 36.37244 | -0.3921924 | 0.1285868 | 9.484625 | 0.0020720 | 0.0192401 |
| Endogenous1_hsa-miR-30e-5p|0_MIMAT0000692 | 136.68859 | 0.4774734 | 0.1570820 | 9.315863 | 0.0022718 | 0.0196888 |
| Endogenous1_hsa-miR-29a-3p|0_MIMAT0000086 | 497.82160 | 0.3946110 | 0.1266899 | 9.089098 | 0.0025714 | 0.0208924 |
| Endogenous1_hsa-miR-20a-5p+hsa-miR-20b-5p|0_MIMAT0000075 | 302.90494 | 0.5366625 | 0.1785337 | 7.922923 | 0.0048812 | 0.0373271 |
Here we do PCA to look at how the samples cluster. We can see a clear separation along the 1st and 2nd PCA based on line, but within a cell line, the different hypoxic states overlap.
dds = DESeqDataSetFromMatrix(counts, colData=metadata,
design=~column+spiked+line+hypoxia+lignorm)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## factor levels were dropped which had no samples
vst = varianceStabilizingTransformation(dds)
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
pca_loadings = function(object, ntop=500) {
rv <- matrixStats::rowVars(assay(object))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop,
length(rv)))]
pca <- prcomp(t(assay(object)[select, ]))
percentVar <- pca$sdev^2/sum(pca$sdev^2)
names(percentVar) = colnames(pca$x)
pca$percentVar = percentVar
return(pca)}
pc = pca_loadings(vst)
comps = data.frame(pc$x)
comps$Name = rownames(comps)
library(dplyr)
comps = comps %>% left_join(metadata, by=c("Name"="id"))
pca_plot = function(comps, nc1, nc2, colorby) {
c1str = paste0("PC", nc1)
c2str = paste0("PC", nc2)
ggplot(comps, aes_string(c1str, c2str, color=colorby)) +
geom_point() + theme_bw() +
xlab(paste0(c1str, ": ", round(pc$percentVar[nc1] * 100), "% variance")) +
ylab(paste0(c2str, ": ", round(pc$percentVar[nc2] * 100), "% variance"))
}
pca_plot(comps, 1, 2, "linehypo")
pca_plot(comps, 3, 4, "linehypo")
Nanostring data has an opposite mean-variance relationship than RNA-seq data. Here the dispersion is slightly higher for more highly expressed miRNA– the the trend is mostly flat, however.
dds = DESeq(dds, fitType='local')
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
plotDispEsts(dds)
Here we ask, controlling for the month, the column and the hypoxic state of the cell line, are there differences between the cell lines. The cell lines are very different from each other, most of the miRNA are significantly different between the cell lines.
res = results(dds, contrast=c("line", "Huh7", "U937"))
res = results(dds, contrast=c("line", "Huh7", "THP"))
res = results(dds, contrast=c("line", "THP", "U937"))
Here we are asking, controlling for the month, column and the cell line, are there differences due to hypoxic state?
res = results(dds, contrast=c("hypoxia", "NO", "IH"))
res = results(dds, contrast=c("hypoxia", "IH", "SH"))
res = results(dds, contrast=c("hypoxia", "NO", "SH"))
Here we refit the model with a new factor that is a combination of cell line and hypoxic state. Then we test if there are differences between the hypoxic states within each cell line.
metadata$linehypo = paste(metadata$line, metadata$hypoxia, sep="_")
dds = DESeqDataSetFromMatrix(counts, colData=metadata,
design=~column+spiked+linehypo+lignorm)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = DESeq(dds, fitType='local')
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
We can se that the overall fold changes are fairly small relative to the standard error of the fold changes. Signal to noise ratio here is combined as the log fold change divided by the log fold change standard error.
write_results = function(res, fname) {
res = data.frame(res) %>% tibble::rownames_to_column() %>%
arrange(pvalue)
write.table(res, file=fname, quote=FALSE, col.names=TRUE,
row.names=FALSE, sep=",")
}
res = results(dds, contrast=c("linehypo", "THP_NO", "THP_IH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
ylab("signal to noise ratio") + ggtitle("THP NO vs IH")
write_results(res, "THP_NOvsIH.csv")
res = results(dds, contrast=c("linehypo", "THP_NO", "THP_SH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
ylab("signal to noise ratio") + ggtitle("THP NO vs SH")
write_results(res, "THP_NOvsSH.csv")
res = results(dds, contrast=c("linehypo", "THP_IH", "THP_SH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
ylab("signal to noise ratio") + ggtitle("THP IH vs SH")
write_results(res, "THP_IHvsSH.csv")
res = results(dds, contrast=c("linehypo", "Huh7_NO", "Huh7_IH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
ylab("signal to noise ratio") + ggtitle("Huh7 NO vs IH")
write_results(res, "Huh7_NOvsIH.csv")
res = results(dds, contrast=c("linehypo", "Huh7_NO", "Huh7_SH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
ylab("signal to noise ratio") + ggtitle("Huh7 NO vs SH")
write_results(res, "Huh7_NOvsSH.csv")
res = results(dds, contrast=c("linehypo", "Huh7_IH", "Huh7_SH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
ylab("signal to noise ratio") + ggtitle("Huh7 IH vs SH")
write_results(res, "Huh7_IHvsSH.csv")
res = results(dds, contrast=c("linehypo", "U937_NO", "U937_IH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
ylab("signal to noise ratio") + ggtitle("U937 NO vs IH")
write_results(res, "U937_NOvsIH.csv")
res = results(dds, contrast=c("linehypo", "U937_NO", "U937_SH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
ylab("signal to noise ratio") + ggtitle("U937 NO vs SH")
write_results(res, "U937_NOvsSH.csv")
res = results(dds, contrast=c("linehypo", "U937_IH", "U937_SH"), addMLE=TRUE)
ggplot(data.frame(res), aes(log2FoldChange, lfcMLE/lfcSE)) + geom_point() +
ylab("signal to noise ratio") + ggtitle("U937 IH vs SH")
write_results(res, "U937_IHvsSH.csv")
miR-210 is below the noise floor– one of the miR-365 spots though has higher expression values.
spotbarplot(extract_pred(counts, is_prior)) + scale_y_log10()
spotboxplot(extract_pred(counts, is_prior))